Blood Donation Predictor


Number of Latent States

GLM integration

Pyro
Python
Bayesian Models
GLM
HMM
Searching for the best number of latent states for Hidden Markov Models with GLM emissions.
Author

Tommaso Piscitelli

Published

August 10, 2025

Code
import pandas as pd
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
from pyprojroot import here
import pyro
import pyro.distributions as dist
import pyro.distributions.constraints as constraints
from pyro.infer import SVI, TraceEnum_ELBO
from pyro.optim import Adam
from scipy.stats import poisson
import torch
import seaborn as sns

data = pd.read_csv(here("data/recent_donations.csv"))
data

# remove columns y_2020 to y_2023
# data = data.drop(columns=["y_2020", "y_2021", "y_2022", "y_2023"])
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[1], line 6
      4 import matplotlib.pyplot as plt
      5 from pyprojroot import here
----> 6 import pyro
      7 import pyro.distributions as dist
      8 import pyro.distributions.constraints as constraints

[... intermediate frames in pyro/poutine and torch elided ...]

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Users\tomma\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.
Code
# ───────────────────────────────────────────────────────────────
#  Required libraries
# ───────────────────────────────────────────────────────────────
import polars as pl
import numpy as np
import torch

# ───────────────────────────────────────────────────────────────
# 1. Load data
# ───────────────────────────────────────────────────────────────
df = pl.from_pandas(data)                # or pl.read_csv("file.csv")

# ───────────────────────────────────────────────────────────────
# 2. Observed counts   obs  ∈  ℕ^{N×T}
# ───────────────────────────────────────────────────────────────
year_cols = sorted([c for c in df.columns if c.startswith("y_")])
T = len(year_cols)
obs = (df.select(year_cols)
         .fill_null(0)
         .to_numpy()
         .astype(int))                   # (N,T)

# ───────────────────────────────────────────────────────────────
# 3. Fixed covariates (for π)
# ───────────────────────────────────────────────────────────────
df = df.with_columns([
    (pl.col("gender") == "F").cast(pl.Int8).alias("gender_code"),
    ((pl.col("birth_year") - pl.col("birth_year").mean()) /
     pl.col("birth_year").std()).alias("birth_year_norm")
])

birth_year_norm = df["birth_year_norm"].to_numpy()        # (N,)
gender_code     = df["gender_code"].to_numpy()            # (N,)

# π-covariate matrix  (N,2)
cov_init = np.stack([birth_year_norm, gender_code], axis=1)

# ───────────────────────────────────────────────────────────────
# 4. Dynamic covariates (for A)
# ───────────────────────────────────────────────────────────────
years_num  = np.array([int(c[2:]) for c in year_cols])    # [2009, …, 2023]
ages       = years_num[None, :] - df["birth_year"].to_numpy()[:, None]
ages_norm  = (ages - ages.mean()) / ages.std()            # (N,T)

covid_mask = np.isin(years_num, [2020, 2021, 2022]).astype(float)  # (T,)
covid_years = np.tile(covid_mask, (df.height, 1))          # (N,T)

# A-covariate tensor (N,T,2)
cov_tran = np.stack([ages_norm, covid_years], axis=2)

# ───────────────────────────────────────────────────────────────
# 5. Torch tensors
# ───────────────────────────────────────────────────────────────
obs_torch      = torch.tensor(obs,      dtype=torch.long)
cov_init_torch = torch.tensor(cov_init, dtype=torch.float)   # (N,2)
cov_tran_torch = torch.tensor(cov_tran, dtype=torch.float)   # (N,T,2)

# ───────────────────────────────────────────────────────────────
# 6. Emission covariates (for emission GLMs in HMM)
#    (N, T, 3): [gender_code, ages_norm, covid_years]
# ───────────────────────────────────────────────────────────────

# Fixed part, repeated along time
# birth_year_norm_tile = np.repeat(birth_year_norm[:, None], T, axis=1)  # (N,T)
gender_code_tile     = np.repeat(gender_code[:, None],     T, axis=1)  # (N,T)

# Assemble emission covariate tensor (N,T,3)
cov_emission = np.stack(
    [gender_code_tile, ages_norm, covid_years],
    axis=2
)  # (N,T,3)

cov_emiss_torch = torch.tensor(cov_emission, dtype=torch.float)

print("Emission covariates:", cov_emiss_torch.shape)   # should be (N,T,4)

print("obs        :", obs_torch.shape)      # (N,T)
print("π covs     :", cov_init_torch.shape) # (N,2)
print("A covs     :", cov_tran_torch.shape) # (N,T,2)
Emission covariates: torch.Size([9236, 15, 3])
obs        : torch.Size([9236, 15])
π covs     : torch.Size([9236, 2])
A covs     : torch.Size([9236, 15, 2])
Code
# ───────────────────────────────────────────────────────────────
#  Required libraries
# ───────────────────────────────────────────────────────────────
import polars as pl
import numpy as np
import torch

# ───────────────────────────────────────────────────────────────
# 1) Load data
# ───────────────────────────────────────────────────────────────
df = pl.from_pandas(data)  # or pl.read_csv("file.csv")

# ───────────────────────────────────────────────────────────────
# 2) Observed counts   obs ∈ ℕ^{N×T}
# ───────────────────────────────────────────────────────────────
year_cols = sorted([c for c in df.columns if c.startswith("y_")])
T = len(year_cols)
obs = (
    df.select(year_cols)
      .fill_null(0)
      .to_numpy()
      .astype(int)
)  # (N,T)

# ───────────────────────────────────────────────────────────────
# 3) Fixed covariates (for π)
# ───────────────────────────────────────────────────────────────
df = df.with_columns([
    (pl.col("gender") == "F").cast(pl.Int8).alias("gender_code"),
    ((pl.col("birth_year") - pl.col("birth_year").mean()) /
     pl.col("birth_year").std()).alias("birth_year_norm"),
])

gender_code     = df["gender_code"].to_numpy().astype(float)     # (N,)
birth_year_norm = df["birth_year_norm"].to_numpy().astype(float) # (N,)

# π-covariate matrix (N,2): [birth_year_norm, gender_code]
cov_init = np.stack([birth_year_norm, gender_code], axis=1).astype(np.float32)  # (N,2)

# ───────────────────────────────────────────────────────────────
# 4) Dynamic covariates (for A) — age bins (one-hot) + covid
# ───────────────────────────────────────────────────────────────
years_num  = np.array([int(c[2:]) for c in year_cols], dtype=int)    # [2009, …, 2023]
ages       = years_num[None, :] - df["birth_year"].to_numpy()[:, None]  # (N,T)

# Age binning for transitions (adjust the breaks if needed)
age_bins = np.array([18, 26, 36, 46, 56, 61, 66, 200])  # 18–25, 26–35, …, 66+
ages_binned = np.digitize(ages, age_bins, right=False)   # (N,T), in {1..7}
n_agebins = len(age_bins) - 1
# One-hot encoding in float32
ages_onehot = np.eye(n_agebins, dtype=np.float32)[np.clip(ages_binned - 1, 0, n_agebins-1)]  # (N,T,7)

# Covid indicator per year
covid_mask  = np.isin(years_num, [2020, 2021, 2022]).astype(np.float32)  # (T,)
covid_years = np.tile(covid_mask, (obs.shape[0], 1)).astype(np.float32)  # (N,T)

# A-covariate tensor: [age_onehot(7), covid(1)] → (N,T,8)
cov_tran = np.concatenate([ages_onehot, covid_years[:, :, None]], axis=2).astype(np.float32)  # (N,T,8)

# ───────────────────────────────────────────────────────────────
# 5) Emission covariates (for emission GLMs in HMM)
#     Example: [gender(1), age_onehot(7), covid(1)] → (N,T,9)
# ───────────────────────────────────────────────────────────────
gender_code_tile = np.repeat(gender_code[:, None], T, axis=1).astype(np.float32)  # (N,T)

cov_emission = np.concatenate(
    [
        gender_code_tile[:, :, None],   # (N,T,1)
        ages_onehot,                    # (N,T,7)
        covid_years[:, :, None],        # (N,T,1)
        # optional: birth_year_norm repeated along T:
        # np.repeat(birth_year_norm[:, None, None], T, axis=1)
    ],
    axis=2
).astype(np.float32)  # (N,T,9)

# ───────────────────────────────────────────────────────────────
# 6) Torch tensors
# ───────────────────────────────────────────────────────────────
obs_torch       = torch.tensor(obs,        dtype=torch.long)   # (N,T)
cov_init_torch  = torch.tensor(cov_init,   dtype=torch.float)  # (N,2)
cov_tran_torch  = torch.tensor(cov_tran,   dtype=torch.float)  # (N,T,8)
cov_emiss_torch = torch.tensor(cov_emission, dtype=torch.float) # (N,T,9)

print("obs        :", obs_torch.shape)        # (N,T)
print("π covs     :", cov_init_torch.shape)   # (N,2)
print("A covs     :", cov_tran_torch.shape)   # (N,T,8)
print("Emission covs:", cov_emiss_torch.shape) # (N,T,9)

# ───────────────────────────────────────────────────────────────
# 7) STRATIFIED SPLIT 90/10 by gender × age-bin (age at t=0)
# ───────────────────────────────────────────────────────────────
N, T = obs_torch.shape
age0 = ages[:, 0]  # (N,)
bins_split = np.array([0, 25, 35, 45, 55, 65, 75, 120])
age_bin = np.digitize(age0, bins_split, right=False)
labels = np.array([f"{int(g)}-{a}" for g, a in zip(gender_code, age_bin)])

rng = np.random.default_rng(42)
indices = np.arange(N)
test_idx = []
for lab in np.unique(labels):
    idx_lab = indices[labels == lab]
    if idx_lab.size == 0:
        continue
    n_test = max(1, int(np.ceil(0.10 * idx_lab.size)))
    pick = rng.choice(idx_lab, size=n_test, replace=False)
    test_idx.append(pick)

test_idx = np.sort(np.concatenate(test_idx))
train_idx = np.setdiff1d(indices, test_idx)

# ───────────────────────────────────────────────────────────────
# 8) Subset train / test
# ───────────────────────────────────────────────────────────────
obs_train = obs_torch[train_idx]
xpi_train = cov_init_torch[train_idx]
xA_train  = cov_tran_torch[train_idx]
cov_train = torch.cat([xpi_train, xA_train[:, -1, :]], dim=1)  # for an optional vanilla GLM

obs_test = obs_torch[test_idx]
xpi_test = cov_init_torch[test_idx]
xA_test  = cov_tran_torch[test_idx]
cov_test = torch.cat([xpi_test, xA_test[:, -1, :]], dim=1)

cov_emission_train = cov_emiss_torch[train_idx]   # (N_train, T, 9)
cov_emission_test  = cov_emiss_torch[test_idx]    # (N_test,  T, 9)

print(f"N train = {obs_train.shape[0]}, N test = {obs_test.shape[0]}")
print("Emission covariates (train):", cov_emission_train.shape)
print("Emission covariates (test): ", cov_emission_test.shape)
obs        : torch.Size([9236, 15])
π covs     : torch.Size([9236, 2])
A covs     : torch.Size([9236, 15, 8])
Emission covs: torch.Size([9236, 15, 9])
N train = 8308, N test = 928
Emission covariates (train): torch.Size([8308, 15, 9])
Emission covariates (test):  torch.Size([928, 15, 9])
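
As an aside, a similar stratified 90/10 split could be obtained with scikit-learn's `train_test_split`; a minimal sketch, assuming scikit-learn is available (the manual loop above is what the analysis actually uses, and unlike this sketch it guarantees at least one test unit per stratum):

Code
# Hypothetical alternative to the manual stratified split above.
# `labels` is the gender × age-bin label array built in step 7.
from sklearn.model_selection import train_test_split

train_idx_alt, test_idx_alt = train_test_split(
    np.arange(N), test_size=0.10, stratify=labels, random_state=42
)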

Comparison of the full model with covariates

Code
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, TraceEnum_ELBO, config_enumerate
from pyro.optim import Adam
import hmm_glm_prediction
Code
# ──────────────────────────────────────────────
# MODEL
# ──────────────────────────────────────────────
def make_hmm_model_and_guide_cov(K):
    @config_enumerate
    def model(obs, x_pi, x_A, x_em):
        # coerce obs to a torch.Tensor
        obs = torch.as_tensor(obs)
        N, T = obs.shape
        C_pi = x_pi.shape[1]
        C_A  = x_A.shape[2]
        C_em = x_em.shape[2]

        # Priors
        alpha_pi = 0.5 * torch.ones(K)        # <1 → spikier initial distribution
        alpha_A  = torch.full((K, K), 0.5)
        alpha_A.fill_diagonal_(6.0)           # strong prior mass on the diagonal (sticky states)
        
        pi_base = pyro.sample("pi_base", dist.Dirichlet(alpha_pi))             # [K]
        A_base  = pyro.sample("A_base",  dist.Dirichlet(alpha_A).to_event(1))  # [K,K]
        
        log_pi_base = pi_base.log()
        log_A_base  = A_base.log()

        # Global GLM parameters (point estimates)
        W_pi  = pyro.param("W_pi", 0.01 * torch.randn(K, C_pi))
        W_A   = pyro.param("W_A",  0.01 * torch.randn(K, K, C_A))
        beta_em = pyro.param("beta_em", 0.01 * torch.randn(K, C_em + 1))
        
        with pyro.plate("seqs", N):
            # initial state
            logits0 = log_pi_base + (x_pi @ W_pi.T)
            z_prev = pyro.sample("z_0", dist.Categorical(logits=logits0),
                                infer={"enumerate": "parallel"})
            
            log_mu_0 = beta_em[z_prev, 0] + (x_em[:, 0, :] * beta_em[z_prev, 1:]).sum(-1)
            pyro.sample("y_0", dist.Poisson(log_mu_0.exp()), obs=obs[:, 0])

            # transitions
            for t in range(1, T):
                x_t = x_A[:, t, :]
                
                logitsT = (log_A_base[z_prev] + (W_A[z_prev] * x_t[:, None, :]).sum(-1))
                z_t = pyro.sample(f"z_{t}", dist.Categorical(logits=logitsT),
                                infer={"enumerate": "parallel"})
                
                log_mu_t = beta_em[z_t, 0] + (x_em[:, t, :] * beta_em[z_t, 1:]).sum(-1)
                pyro.sample(f"y_{t}", dist.Poisson(log_mu_t.exp()), obs=obs[:, t])
                
                z_prev = z_t
        
      
    def guide(obs, x_pi, x_A, x_em):
        # coerce obs to a torch.Tensor
        obs = torch.as_tensor(obs)
        # MAP (Delta) parameters for pi and A
        pi_q = pyro.param("pi_base_map",
                        torch.ones(K) / K,
                        constraint=dist.constraints.simplex)

        A_init = torch.eye(K) * (K - 1.) + 1.
        A_init = A_init / A_init.sum(-1, keepdim=True)
        A_q = pyro.param("A_base_map",
                        A_init,
                        constraint=dist.constraints.simplex)

        pyro.sample("pi_base", dist.Delta(pi_q).to_event(1))
        pyro.sample("A_base",  dist.Delta(A_q).to_event(2))

    
    #num_params = K * x_pi.shape[1] + K * K * x_A.shape[2] + K * (x_em.shape[2] + 1) + K + K*K


    return model, guide
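
For reference, the generative structure implemented by `model` can be written compactly as (a direct transcription of the code, with $x^{\pi}_i$, $x^{A}_{i,t}$ and $x^{\mathrm{em}}_{i,t}$ the three covariate blocks):

$$
\begin{aligned}
z_{i,0} &\sim \mathrm{Cat}\big(\mathrm{softmax}(\log \pi^{\mathrm{base}} + W_{\pi}\, x^{\pi}_{i})\big)\\
z_{i,t} \mid z_{i,t-1}=k &\sim \mathrm{Cat}\big(\mathrm{softmax}(\log A^{\mathrm{base}}_{k} + W_{A,k}\, x^{A}_{i,t})\big)\\
y_{i,t} \mid z_{i,t}=k &\sim \mathrm{Poisson}\big(\exp(\beta_{k,0} + \beta_{k}^{\top} x^{\mathrm{em}}_{i,t})\big)
\end{aligned}
$$

$\pi^{\mathrm{base}}$ and the rows of $A^{\mathrm{base}}$ receive Dirichlet priors (with extra mass on the diagonal of $A$), while $W_{\pi}$, $W_{A}$ and $\beta$ are point-estimated as `pyro.param`s.
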
Code
@torch.no_grad()
def extract_posterior_point_estimates_cov():
    """
    Reads the variational parameters from the ParamStore and returns point
    estimates of π, A and the emission rates under average covariates.
    """
    
    store = pyro.get_param_store()

    def softmax_row(v):
        e = np.exp(v - np.max(v, axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    # 1) Extract learned parameters
    pi_base = pyro.param("pi_base_map").detach().cpu().numpy()      # (K,) simplex
    A_base  = pyro.param("A_base_map").detach().cpu().numpy()       # (K, K) rows on simplex
    W_pi    = pyro.param("W_pi").detach().cpu().numpy()             # (K, C_pi)
    W_A     = pyro.param("W_A").detach().cpu().numpy()              # (K, K, C_A)
    beta_em = pyro.param("beta_em").detach().cpu().numpy()          # (K, 1 + C_em) if intercept first


    # 2) Covariate means
    x_mean_pi = cov_init_torch.mean(dim=0).detach().cpu().numpy()        # (C_pi,)
    x_mean_A  = cov_tran_torch.mean(dim=(0, 1)).detach().cpu().numpy()   # (C_A,)
    x_mean_em = cov_emiss_torch.mean(dim=(0,1)).detach().cpu().numpy()   # (C_em,)

    print("Mean covariates (π):", x_mean_pi)
    print("Mean covariates (A):", x_mean_A)
    print("Mean covariates (emission):", x_mean_em)

    # 3) Mean initial probs, transitions and rates under average covariates
    logits_pi = np.log(pi_base + 1e-30) + W_pi @ x_mean_pi
    pi_mean   = softmax_row(logits_pi[None, :]).ravel()

    K = pi_mean.shape[0]
    A_mean = np.zeros((K, K))
    for k in range(K):
        logits_row = np.log(A_base[k] + 1e-30) + (W_A[k] @ x_mean_A)
        A_mean[k] = softmax_row(logits_row[None, :]).ravel()
   
    rates_mean = np.zeros(K)
    for k in range(K):
        log_mu = beta_em[k, 0] + np.dot(x_mean_em, beta_em[k, 1:])
        rates_mean[k] = np.exp(log_mu)

    # # 4) Names
    # state_names = [f"State {i}" for i in range(K)]

    # # Infer emission coefficient names from beta_em shape
    # C_em = beta_em.shape[1] - 1  # exclude intercept
    # coeff_names = [f"feat{i}" for i in range(C_em)]
    # include_intercept = True

    # # Optional check: ensure consistency with your covariate tensor
    # if 'cov_emiss_torch' in globals():
    #     C_em_from_tensor = int(cov_emiss_torch.shape[-1])
    #     if C_em_from_tensor != C_em:
    #         print(f"Warning: beta_em has {C_em} coefficients, but cov_emission tensor has {C_em_from_tensor} features.")


    return pi_mean, A_mean, rates_mean

def evaluate_hmm_glm_prediction(obs_test, xpi_test, xA_test, cov_emission_test):
    """
    Uses the parameters learned in the ParamStore to predict on the test data
    and compute MSE and accuracy, also showing a plot.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    store = pyro.get_param_store()

    def softmax_row(v):
        e = np.exp(v - np.max(v, axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    # 1) Extract learned parameters
    pi_base = pyro.param("pi_base_map").detach().cpu().numpy()      # (K,)
    A_base  = pyro.param("A_base_map").detach().cpu().numpy()       # (K, K)
    W_pi    = pyro.param("W_pi").detach().cpu().numpy()             # (K, C_pi)
    W_A     = pyro.param("W_A").detach().cpu().numpy()              # (K, K, C_A)
    beta_em = pyro.param("beta_em").detach().cpu().numpy()          # (K, 1 + C_em)

    # 2) Convert test data to NumPy
    obs_test_np          = obs_test.detach().cpu().numpy()
    xpi_test_np          = xpi_test.detach().cpu().numpy()
    xA_test_np           = xA_test.detach().cpu().numpy()
    cov_emission_test_np = cov_emission_test.detach().cpu().numpy()

    # 3) One-step ahead prediction
    y_pred_hmm, state_prob = hmm_glm_prediction.hmm_forward_predict(
        obs_so_far=obs_test_np[:, :-1],
        xpi=xpi_test_np,
        xA=xA_test_np,
        A_base=A_base,
        W_pi=W_pi,
        W_A=W_A,
        pi_base=pi_base,
        beta_em=beta_em,
        cov_emission=cov_emission_test_np,
        steps_ahead=1
    )

    # 4) True values
    y_test = obs_test_np[:, -1]

    # 5) Compute metrics
    mse = np.mean((y_pred_hmm - y_test)**2)
    acc = 100*np.mean(np.round(y_pred_hmm) == y_test)

    print(f"HMM(full): pred mean={y_pred_hmm.mean():.2f}  obs mean={y_test.mean():.2f}")
    print(f"MSE: {mse:.4f}")
    print(f"Accuracy (round): {acc:.2f}%")

    # 6) Plot predictions vs observations
    plt.figure(figsize=(8,4))
    plt.scatter(range(len(y_test)), y_test, label="Observed", alpha=0.7)
    plt.scatter(range(len(y_pred_hmm)), y_pred_hmm, label="Predicted", alpha=0.7)
    plt.title("HMM GLM: One-step ahead predictions")
    plt.xlabel("Sequence index")
    plt.ylabel("Observation")
    plt.legend()
    plt.grid(True)
    plt.show()

    return y_pred_hmm, y_test, mse, acc

def log_softmax_logits(logits, dim=-1):
    return logits - torch.logsumexp(logits, dim=dim, keepdim=True)

@torch.no_grad()
def forward_loglik_cov(obs, x_pi, x_A, x_em):
    device = obs.device
    ps = pyro.get_param_store()
    pi_base = ps["pi_base_map"].to(device)
    A_base  = ps["A_base_map"].to(device)
    W_pi    = ps["W_pi"].to(device)
    W_A     = ps["W_A"].to(device)
    beta_em = ps["beta_em"].to(device)

    N, T = obs.shape
    K = pi_base.shape[0]
    b0 = beta_em[:, 0]
    B  = beta_em[:, 1:]
    log_mu = torch.einsum("ntc,kc->ntk", x_em.to(device), B) + b0.view(1, 1, K)
    emis_log = dist.Poisson(rate=log_mu.exp()).log_prob(obs.unsqueeze(-1))  # (N,T,K)

    log_pi = log_softmax_logits(pi_base.log() + x_pi @ W_pi.T, dim=1)       # (N,K)
    log_alpha = log_pi + emis_log[:, 0]

    log_A0 = A_base.log()
    for t in range(1, T):
        x_t = x_A[:, t, :]
        logits = log_A0.unsqueeze(0) + (W_A.unsqueeze(0) * x_t[:, None, None, :]).sum(-1)
        log_A = log_softmax_logits(logits, dim=2)
        log_alpha = torch.logsumexp(log_alpha.unsqueeze(2) + log_A, dim=1) + emis_log[:, t]
    return torch.logsumexp(log_alpha, dim=1)  # (N,)
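
`forward_loglik_cov` implements the standard forward algorithm in log space. With emission log-probabilities $e_{t,k} = \log p(y_t \mid z_t = k)$ and the covariate-dependent $\pi$ and $A$ defined above, it computes, for each sequence,

$$
\log \alpha_{0,k} = \log \pi_{k} + e_{0,k}, \qquad
\log \alpha_{t,k} = \operatorname{logsumexp}_{j}\big(\log \alpha_{t-1,j} + \log A_{t,jk}\big) + e_{t,k},
$$

and returns $\log p(y_{0:T-1}) = \operatorname{logsumexp}_{k} \log \alpha_{T-1,k}$.
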
Code
def train_and_evaluate_cov(obs_torch, x_pi, x_A, x_em, K_list, n_steps=500, lr=1e-5):
    log_evidences = []
    final_elbos = []   # final ELBO for each K
    saved_param_files = []  # keep track of the saved parameter files

    for K in K_list:
        print(f"\n=== Training HMM with K={K} states ===")
        
        # build model and guide
        model, guide = make_hmm_model_and_guide_cov(K)

        # reset the ParamStore
        pyro.clear_param_store()

        svi = SVI(model, guide,
                  Adam({"lr": lr}),
                  loss=TraceEnum_ELBO(max_plate_nesting=1))

        losses = []
        for step in range(n_steps):
            loss = svi.step(obs_torch, x_pi, x_A, x_em)
            losses.append(loss)
            if step % 50 == 0:
                print(f"K={K} | step {step:4d}  ELBO = {loss:,.0f}")
        
        # final ELBO (the last loss value is the negative ELBO)
        final_elbo_val = -losses[-1]
        final_elbos.append(final_elbo_val)

        # extract point estimates
        pi_mean, A_mean, rates_mean = extract_posterior_point_estimates_cov()
        
        params = {
            "pi_hat": torch.tensor(pi_mean, dtype=torch.float32),
            "A_hat": torch.tensor(A_mean, dtype=torch.float32),
            "rates_hat": torch.tensor(rates_mean, dtype=torch.float32)
        }

        evaluate_hmm_glm_prediction(obs_torch, x_pi, x_A, x_em)

        # compute the log-likelihood / evidence
        log_evidence_val = forward_loglik_cov(obs_torch, x_pi, x_A, x_em).sum()
        log_evidences.append(log_evidence_val)
        print(f"Log-evidence K={K}: {log_evidence_val:.2f}")

        # save the parameters to file
        param_file = f"hmm_glm_params_K{K}.pt"
        pyro.get_param_store().save(param_file)
        saved_param_files.append(param_file)
        print(f"Parametri salvati in {param_file}")

    # plot
    plt.figure(figsize=(10,5))
    plt.plot(K_list, log_evidences, marker='o', label="Log-evidence")
#   plt.plot(K_list, final_elbos, marker='x', label="Final ELBO")
    plt.xlabel("Number of latent states K")
    plt.ylabel("Value")
    plt.title("Model comparison via log-evidence and ELBO")
    plt.legend()
    plt.grid(True)
    plt.show()

    return K_list, log_evidences, final_elbos, saved_param_files

K_list = range(2, 9)
K_list, log_evidences, final_elbos, saved_param_files = train_and_evaluate_cov(obs_torch, cov_init_torch, cov_tran_torch, cov_emiss_torch, K_list, n_steps=200, lr=2e-3)
Code
C_pi = cov_init_torch.shape[1]
C_A  = cov_tran_torch.shape[2]
C_em = cov_emiss_torch.shape[2]


K_list = list(range(1, 9))
log_evidences = np.array([
    -174517.90625,
  -140057.25,
  -139246.84375,
  -138848.71875,
  -137796.09375,
  -136732.84375,
  -136524.203125,
  -135943.8125
])

# (optional) take absolute values of the log-evidences
#log_evidences_pos = np.abs(log_evidences)

# compute num_params for each K
num_params_list = []
for K in K_list:
    num_params = K * C_pi + K * K * C_A + K * (C_em + 1) + K + K*K
    num_params_list.append(num_params)
num_params_list = np.array(num_params_list)

# number of sequences
N = 9236

# BIC-style penalized criterion
penalized = log_evidences - 0.5 * num_params_list * np.log(N)

# plot
plt.figure(figsize=(10, 5))
plt.plot(K_list, log_evidences, marker='o', label='Log-evidence')
plt.plot(K_list, penalized, marker='x', label='Penalized (BIC-like)')
plt.xlabel("Number of latent states K")
plt.ylabel("Value")
plt.title("Positive log-evidence and penalized criterion")
plt.grid(True)
plt.legend()
plt.show()
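
The penalized criterion is the usual BIC-style correction on the log scale (higher is better):

$$
\mathrm{score}(K) = \log \hat{p}(\mathbf{y}) - \tfrac{1}{2}\, p_K \log N,
\qquad
p_K = K\,C_{\pi} + K^{2} C_{A} + K\,(C_{\mathrm{em}} + 1) + K + K^{2}.
$$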

Comparison of the base model

Code
# ------------------------------------------------------------------ #
# FUNCTION TO TRAIN HMMs WITH DIFFERENT VALUES OF K                  #
# ------------------------------------------------------------------ #
def train_hmm_models(obs, K_values=[3, 5], n_steps=800, n_inits=3, lr=0.05):
    results = {}      # dict: K -> list of final losses, one per seed
    best_losses = {}  # dict: K -> best loss across seeds

    for K in K_values:
        print("\n" + "="*50)
        print(f" Allenamento modello con K = {K} stati ")
        print("="*50)

        results[K] = []

        for i in range(n_inits):
            print(f" Seed= {i}")
            pyro.set_rng_seed(i)
            pyro.clear_param_store()

            # ------------------------------
            # MODEL
            # ------------------------------
            def model(obs):
                N, T = obs.shape
                pi = pyro.sample("pi", dist.Dirichlet(torch.ones(K)))   # [K]
                with pyro.plate("row", K):
                    A = pyro.sample("A", dist.Dirichlet(torch.ones(K))) # [K,K]
                rates = pyro.sample("rates",
                                    dist.Gamma(2.*torch.ones(K),
                                            1.*torch.ones(K)).to_event(1))
                with pyro.plate("donor", N):
                    z = pyro.sample("z_0", dist.Categorical(pi),
                                    infer={"enumerate": "parallel"})
                    for t in pyro.markov(range(T)):
                        pyro.sample(f"y_{t}", dist.Poisson(rates[z]), obs=obs[:, t])
                        if t < T-1:
                            z = pyro.sample(f"z_{t+1}", dist.Categorical(A[z]),
                                            infer={"enumerate": "parallel"})

            # ------------------------------
            # GUIDE
            # ------------------------------
            def guide(obs):
                pi_alpha = pyro.param("pi_alpha", torch.ones(K),
                                    constraint=dist.constraints.positive)
                A_alpha = pyro.param("A_alpha", torch.ones(K, K),
                                    constraint=dist.constraints.positive)
                r_alpha = pyro.param("r_alpha", 2.*torch.ones(K),
                                    constraint=dist.constraints.positive)
                r_beta  = pyro.param("r_beta", 1.*torch.ones(K),
                                    constraint=dist.constraints.positive)

                pyro.sample("pi", dist.Dirichlet(pi_alpha))
                with pyro.plate("row", K):
                    pyro.sample("A", dist.Dirichlet(A_alpha))
                pyro.sample("rates", dist.Gamma(r_alpha, r_beta).to_event(1))

            # ------------------------------
            # TRAINING
            # ------------------------------
            svi = SVI(model, guide, Adam({"lr": lr}),
                    loss=TraceEnum_ELBO(max_plate_nesting=1))

            loss = None
            for step in range(n_steps):
                loss = svi.step(obs)
                if step % 100 == 0:
                    print(f"{step:4d}  ELBO = {loss:,.0f}")

            results[K].append(loss)  # store this seed's final loss

        # best loss for this K
        best_losses[K] = min(results[K])
        print(f"\n --> Migliore loss per K={K}: {best_losses[K]:,.0f}")

    # ------------------------------
    # FINAL SUMMARY
    # ------------------------------
    print("\n\nRISULTATI FINALI:")
    best_K = min(best_losses, key=best_losses.get)
    for K in K_values:
        arrow = "  <-- migliore" if K == best_K else ""
        print(f"K = {K:2d} | Loss min su {n_inits} seed = {best_losses[K]:,.0f}{arrow}")

    return best_K, results, best_losses
Code
best_K, results, best_losses = train_hmm_models(obs_torch, K_values=[3,5], n_inits=3, n_steps=500, lr=0.05)

==================================================
 Training model with K = 3 states 
==================================================
 Seed= 0
   0  ELBO = 249,663
 100  ELBO = 154,770
 200  ELBO = 151,826
 300  ELBO = 143,834
 400  ELBO = 134,269
 Seed= 1
   0  ELBO = 224,908
 100  ELBO = 152,509
 200  ELBO = 163,749
 300  ELBO = 139,743
 400  ELBO = 135,251
 Seed= 2
   0  ELBO = 223,440
 100  ELBO = 151,726
 200  ELBO = 140,244
 300  ELBO = 135,001
 400  ELBO = 138,583

 --> Best loss for K=3: 131,856

==================================================
 Training model with K = 5 states 
==================================================
 Seed= 0
   0  ELBO = 217,081
 100  ELBO = 146,315
 200  ELBO = 140,188
 300  ELBO = 142,657
 400  ELBO = 132,561
 Seed= 1
   0  ELBO = 189,444
 100  ELBO = 142,120
 200  ELBO = 134,412
 300  ELBO = 133,304
 400  ELBO = 133,632
 Seed= 2
   0  ELBO = 194,773
 100  ELBO = 146,349
 200  ELBO = 133,447
 300  ELBO = 134,405
 400  ELBO = 132,991

 --> Best loss for K=5: 130,454


FINAL RESULTS:
K =  3 | Min loss over 3 seeds = 131,856
K =  5 | Min loss over 3 seeds = 130,454  <-- best
Code
# retrain the best model
pyro.clear_param_store()
pyro.set_rng_seed(0)  # or the seed that gave the lowest loss

def make_hmm_model_and_guide(K):
    def model(obs):
        N, T = obs.shape
        pi = pyro.sample("pi", dist.Dirichlet(torch.ones(K)))   # [K]
        with pyro.plate("row", K):
            A = pyro.sample("A", dist.Dirichlet(torch.ones(K))) # [K,K]
        rates = pyro.sample("rates",
                            dist.Gamma(2.*torch.ones(K),
                                       1.*torch.ones(K)).to_event(1))
        with pyro.plate("donor", N):
            z = pyro.sample("z_0", dist.Categorical(pi),
                            infer={"enumerate": "parallel"})
            for t in pyro.markov(range(T)):
                pyro.sample(f"y_{t}", dist.Poisson(rates[z]), obs=obs[:, t])
                if t < T-1:
                    z = pyro.sample(f"z_{t+1}", dist.Categorical(A[z]),
                                    infer={"enumerate": "parallel"})

    def guide(obs):
        pi_alpha = pyro.param("pi_alpha", torch.ones(K),
                              constraint=dist.constraints.positive)
        A_alpha = pyro.param("A_alpha", torch.ones(K, K),
                             constraint=dist.constraints.positive)
        r_alpha = pyro.param("r_alpha", 2.*torch.ones(K),
                             constraint=dist.constraints.positive)
        r_beta  = pyro.param("r_beta", 1.*torch.ones(K),
                             constraint=dist.constraints.positive)

        pyro.sample("pi", dist.Dirichlet(pi_alpha))
        with pyro.plate("row", K):
            pyro.sample("A", dist.Dirichlet(A_alpha))
        pyro.sample("rates", dist.Gamma(r_alpha, r_beta).to_event(1))

    num_params = K + K*K + 2*K    

    return model, guide, num_params
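
In symbols, this baseline is a plain Poisson-emission HMM without covariates:

$$
\pi \sim \mathrm{Dir}(\mathbf{1}_K), \qquad
A_{k\cdot} \sim \mathrm{Dir}(\mathbf{1}_K), \qquad
\lambda_k \sim \mathrm{Gamma}(2, 1),
$$

$$
z_0 \sim \mathrm{Cat}(\pi), \qquad
z_t \mid z_{t-1} \sim \mathrm{Cat}(A_{z_{t-1}\cdot}), \qquad
y_t \mid z_t \sim \mathrm{Poisson}(\lambda_{z_t}).
$$
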
Code

@torch.no_grad()
def _dirichlet_point(alpha, mode=False, eps=1e-30):
    """
    alpha: (..., K)
    mean: alpha / alpha.sum(-1)
    mode: (alpha-1)/(sum(alpha)-K) if all alpha_i > 1, otherwise fall back to the mean
    """
    if not mode:
        p = alpha / alpha.sum(-1, keepdim=True)
        return torch.clamp(p, eps, 1.0).to(alpha.dtype)
    # mode
    K = alpha.size(-1)
    num = torch.clamp(alpha - 1.0, min=eps)
    den = torch.clamp(alpha.sum(-1, keepdim=True) - K, min=eps)
    p_mode = num / den
    # if any alpha <= 1, fall back to the mean for stability
    need_mean = (alpha <= 1.0).any(dim=-1, keepdim=True)
    p_mean = alpha / alpha.sum(-1, keepdim=True)
    p = torch.where(need_mean, p_mean, p_mode)
    return torch.clamp(p, eps, 1.0).to(alpha.dtype)

@torch.no_grad()
def _gamma_point(alpha, beta, mode=False, eps=1e-30):
    """
    Pyro Gamma(concentration=alpha, rate=beta)
    mean = alpha / beta
    mode = (alpha-1)/beta if alpha > 1, otherwise mean
    """
    if not mode:
        r = alpha / torch.clamp(beta, eps)
        return torch.clamp(r, eps)
    mode_ok = alpha > 1.0
    r_mode = torch.clamp((alpha - 1.0) / torch.clamp(beta, eps), eps)
    r_mean = torch.clamp(alpha / torch.clamp(beta, eps), eps)
    return torch.where(mode_ok, r_mode, r_mean)

@torch.no_grad()
def extract_posterior_point_estimates(mean_or_mode="mean"):
    """
    Reads the variational parameters from the ParamStore and returns point estimates.
    mean_or_mode in {"mean","mode"}.
    """
    store = pyro.get_param_store()
    for name in ["pi_alpha", "A_alpha", "r_alpha", "r_beta"]:
        if name not in store:
            raise KeyError(f"Parametro '{name}' assente nel ParamStore. Chiavi: {sorted(store.keys())}")

    pi_alpha = pyro.param("pi_alpha")     # (K,)
    A_alpha  = pyro.param("A_alpha")      # (K,K)
    r_alpha  = pyro.param("r_alpha")      # (K,)
    r_beta   = pyro.param("r_beta")       # (K,)

    use_mode = (mean_or_mode == "mode")
    pi_hat = _dirichlet_point(pi_alpha, mode=use_mode)        # (K,)
    A_hat  = _dirichlet_point(A_alpha,  mode=use_mode)        # (K,K), rows sum to 1
    rates_hat = _gamma_point(r_alpha, r_beta, mode=use_mode)  # (K,)

    return pi_hat, A_hat, rates_hat
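
These helpers use the standard closed forms: for $p \sim \mathrm{Dir}(\alpha)$,

$$
\mathbb{E}[p_k] = \frac{\alpha_k}{\sum_j \alpha_j}, \qquad
\operatorname{mode}(p)_k = \frac{\alpha_k - 1}{\sum_j \alpha_j - K} \quad (\text{all } \alpha_k > 1),
$$

and for $\lambda \sim \mathrm{Gamma}(\alpha, \beta)$ with rate $\beta$, $\mathbb{E}[\lambda] = \alpha/\beta$ and $\operatorname{mode}(\lambda) = (\alpha - 1)/\beta$ when $\alpha > 1$, with the mean as fallback otherwise.
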
Code
# function to compute the log-likelihood using the forward algorithm
def log_evidence(params, obs):
    """
    Computes log p(obs) using the forward algorithm with the learned
    parameters (params). Unlike EM, where the parameters are updated,
    here the parameters are kept fixed.
    """
    with torch.no_grad():
        K = params["pi_hat"].numel()
        
        pi  = dist.Dirichlet(params["pi_hat"]).mean          # [K]
        A   = dist.Dirichlet(params["A_hat"]).mean           # [K,K]
        lam = params["rates_hat"]            # [K]

        log_pi = pi.log()                          # initial log-probabilities
        log_A  = (A / A.sum(1, keepdim=True)).log()             # log transition matrix
        emis   = dist.Poisson(lam).log_prob(obs.unsqueeze(-1))  # (N,T,K) emission log-prob

        N, T = obs.shape
        log_alpha = log_pi + emis[:, 0]                          # initialization

        for t in range(1, T):
            # forward update: logsumexp over the previous-state dimension
            log_alpha = (log_alpha.unsqueeze(2) + log_A).logsumexp(1) + emis[:, t]

        # total log-likelihood (summed over sequences)
        total_ll = log_alpha.logsumexp(1).sum().item()
        return total_ll
Code
def train_and_evaluate(obs_torch, K_list, n_steps=500, lr=2e-3):
    log_evidences = []

    for K in K_list:
        print(f"\n=== Training HMM with K={K} states ===")
        
        # build model and guide
        model, guide, num_params = make_hmm_model_and_guide(K)

        # reset the ParamStore
        pyro.clear_param_store()

        # set up SVI
        svi = SVI(model, guide, Adam({"lr": lr}),
                  loss=TraceEnum_ELBO(max_plate_nesting=1))

        # training
        loss = None
        for step in range(n_steps):
            loss = svi.step(obs_torch)
            if step % 10 == 0:
                print(f"{step:4d}  ELBO = {loss:,.0f}")

        # extract point estimates
        pi_hat, A_hat, rates_hat = extract_posterior_point_estimates(mean_or_mode="mean")
        params = {
            "pi_hat": pi_hat,
            "A_hat": A_hat,
            "rates_hat": rates_hat
        }

        # compute the log-likelihood / evidence
        log_evidence_val = log_evidence(params, obs_torch)
        log_evidences.append(log_evidence_val)
        print(f"Log-evidence K={K}: {log_evidence_val:.2f}")

    # plot
    plt.figure(figsize=(8,5))
    plt.plot(K_list, log_evidences, marker='o')
    plt.xlabel("Number of latent states K")
    plt.ylabel("Log-evidence")
    plt.title("Model comparison via log-evidence")
    plt.grid(True)
    plt.show()

    return K_list, log_evidences



K_list = range(2, 8)
train_and_evaluate(obs_torch, K_list, n_steps=500, lr=2e-3)

=== Training HMM with K=2 states ===
   0  ELBO = 190,232
  10  ELBO = 193,415
  20  ELBO = 159,915
  30  ELBO = 375,032
  40  ELBO = 175,100
  50  ELBO = 210,764
  60  ELBO = 178,541
  70  ELBO = 172,792
  80  ELBO = 236,583
  90  ELBO = 227,445
 100  ELBO = 316,513
 110  ELBO = 184,105
 120  ELBO = 171,063
 130  ELBO = 203,254
 140  ELBO = 186,968
 150  ELBO = 192,389
 160  ELBO = 203,597
 170  ELBO = 174,310
 180  ELBO = 202,074
 190  ELBO = 165,841
 200  ELBO = 172,105
 210  ELBO = 206,842
 220  ELBO = 178,325
 230  ELBO = 178,878
 240  ELBO = 228,023
 250  ELBO = 206,081
 260  ELBO = 222,080
 270  ELBO = 153,061
 280  ELBO = 204,136
 290  ELBO = 166,776
 300  ELBO = 197,441
 310  ELBO = 166,890
 320  ELBO = 167,722
 330  ELBO = 194,650
 340  ELBO = 180,926
 350  ELBO = 185,720
 360  ELBO = 176,313
 370  ELBO = 194,673
 380  ELBO = 162,984
 390  ELBO = 167,645
 400  ELBO = 142,858
 410  ELBO = 284,911
 420  ELBO = 168,510
 430  ELBO = 171,370
 440  ELBO = 283,396
 450  ELBO = 172,392
 460  ELBO = 205,640
 470  ELBO = 219,136
 480  ELBO = 198,483
 490  ELBO = 163,924
Log-evidence K=2: -176548.53

=== Training HMM with K=3 states ===
   0  ELBO = 247,371
  10  ELBO = 172,018
  20  ELBO = 193,899
  30  ELBO = 227,453
  40  ELBO = 191,811
  50  ELBO = 298,896
  60  ELBO = 200,825
  70  ELBO = 178,750
  80  ELBO = 199,435
  90  ELBO = 178,607
 100  ELBO = 243,646
 110  ELBO = 178,217
 120  ELBO = 182,281
 130  ELBO = 172,379
 140  ELBO = 166,560
 150  ELBO = 182,253
 160  ELBO = 267,153
 170  ELBO = 220,679
 180  ELBO = 178,813
 190  ELBO = 200,628
 200  ELBO = 175,836
 210  ELBO = 178,554
 220  ELBO = 181,986
 230  ELBO = 172,293
 240  ELBO = 173,313
 250  ELBO = 178,147
 260  ELBO = 173,142
 270  ELBO = 198,315
 280  ELBO = 166,034
 290  ELBO = 186,686
 300  ELBO = 218,927
 310  ELBO = 177,696
 320  ELBO = 173,187
 330  ELBO = 173,021
 340  ELBO = 169,031
 350  ELBO = 184,694
 360  ELBO = 173,284
 370  ELBO = 174,705
 380  ELBO = 169,267
 390  ELBO = 165,636
 400  ELBO = 190,386
 410  ELBO = 178,986
 420  ELBO = 173,631
 430  ELBO = 171,541
 440  ELBO = 168,006
 450  ELBO = 166,954
 460  ELBO = 147,162
 470  ELBO = 163,471
 480  ELBO = 154,490
 490  ELBO = 190,566
Log-evidence K=3: -176235.47

=== Training HMM with K=4 states ===
   0  ELBO = 240,320
  10  ELBO = 312,726
  20  ELBO = 171,498
  30  ELBO = 185,044
  40  ELBO = 210,946
  50  ELBO = 215,290
  60  ELBO = 196,785
  70  ELBO = 251,274
  80  ELBO = 235,890
  90  ELBO = 167,417
 100  ELBO = 181,020
 110  ELBO = 200,026
 120  ELBO = 193,995
 130  ELBO = 173,299
 140  ELBO = 179,895
 150  ELBO = 161,756
 160  ELBO = 161,841
 170  ELBO = 172,566
 180  ELBO = 274,967
 190  ELBO = 186,367
 200  ELBO = 177,715
 210  ELBO = 176,197
 220  ELBO = 167,204
 230  ELBO = 193,153
 240  ELBO = 215,687
 250  ELBO = 176,218
 260  ELBO = 171,713
 270  ELBO = 165,249
 280  ELBO = 161,930
 290  ELBO = 166,931
 300  ELBO = 167,066
 310  ELBO = 172,889
 320  ELBO = 177,175
 330  ELBO = 175,511
 340  ELBO = 175,998
 350  ELBO = 184,089
 360  ELBO = 190,777
 370  ELBO = 162,425
 380  ELBO = 168,862
 390  ELBO = 177,483
 400  ELBO = 183,929
 410  ELBO = 182,565
 420  ELBO = 168,867
 430  ELBO = 187,491
 440  ELBO = 204,556
 450  ELBO = 155,868
 460  ELBO = 180,376
 470  ELBO = 229,833
 480  ELBO = 171,871
 490  ELBO = 184,568
Log-evidence K=4: -175584.80

=== Training HMM with K=5 states ===
   0  ELBO = 301,775
  10  ELBO = 265,383
  20  ELBO = 168,610
  30  ELBO = 196,518
  40  ELBO = 268,005
  50  ELBO = 287,431
  60  ELBO = 181,713
  70  ELBO = 208,820
  80  ELBO = 173,086
  90  ELBO = 200,453
 100  ELBO = 164,948
 110  ELBO = 230,225
 120  ELBO = 189,927
 130  ELBO = 180,045
 140  ELBO = 188,618
 150  ELBO = 205,706
 160  ELBO = 174,525
 170  ELBO = 170,300
 180  ELBO = 234,548
 190  ELBO = 174,461
 200  ELBO = 183,315
 210  ELBO = 188,289
 220  ELBO = 174,339
 230  ELBO = 167,151
 240  ELBO = 182,117
 250  ELBO = 205,801
 260  ELBO = 161,744
 270  ELBO = 177,010
 280  ELBO = 166,169
 290  ELBO = 188,263
 300  ELBO = 177,360
 310  ELBO = 173,780
 320  ELBO = 157,469
 330  ELBO = 172,907
 340  ELBO = 174,215
 350  ELBO = 199,236
 360  ELBO = 178,875
 370  ELBO = 166,361
 380  ELBO = 171,062
 390  ELBO = 164,785
 400  ELBO = 184,789
 410  ELBO = 188,330
 420  ELBO = 173,451
 430  ELBO = 170,524
 440  ELBO = 158,213
 450  ELBO = 160,903
 460  ELBO = 169,181
 470  ELBO = 167,878
 480  ELBO = 190,540
 490  ELBO = 181,442
Log-evidence K=5: -176189.83

=== Training HMM with K=6 states ===
   0  ELBO = 205,246
  10  ELBO = 204,992
  20  ELBO = 178,281
  30  ELBO = 224,585
  40  ELBO = 184,819
  50  ELBO = 193,440
  60  ELBO = 262,086
  70  ELBO = 193,501
  80  ELBO = 179,400
  90  ELBO = 193,507
 100  ELBO = 165,328
 110  ELBO = 167,362
 120  ELBO = 191,785
 130  ELBO = 196,277
 140  ELBO = 177,968
 150  ELBO = 201,942
 160  ELBO = 168,274
 170  ELBO = 177,236
 180  ELBO = 197,133
 190  ELBO = 170,824
 200  ELBO = 170,287
 210  ELBO = 170,944
 220  ELBO = 168,661
 230  ELBO = 167,795
 240  ELBO = 174,382
 250  ELBO = 166,777
 260  ELBO = 186,454
 270  ELBO = 151,819
 280  ELBO = 167,925
 290  ELBO = 169,870
 300  ELBO = 174,078
 310  ELBO = 171,096
 320  ELBO = 169,600
 330  ELBO = 171,617
 340  ELBO = 170,151
 350  ELBO = 175,450
 360  ELBO = 172,908
 370  ELBO = 166,262
 380  ELBO = 170,416
 390  ELBO = 171,710
 400  ELBO = 164,363
 410  ELBO = 178,430
 420  ELBO = 172,183
 430  ELBO = 155,204
 440  ELBO = 170,694
 450  ELBO = 170,051
 460  ELBO = 158,892
 470  ELBO = 208,772
 480  ELBO = 168,031
 490  ELBO = 174,782
Log-evidence K=6: -175024.06

=== Training HMM with K=7 states ===
   0  ELBO = 219,438
  10  ELBO = 196,574
  20  ELBO = 202,997
  30  ELBO = 194,856
  40  ELBO = 270,244
  50  ELBO = 241,503
  60  ELBO = 214,216
  70  ELBO = 172,002
  80  ELBO = 172,319
  90  ELBO = 177,734
 100  ELBO = 170,084
 110  ELBO = 262,432
 120  ELBO = 190,945
 130  ELBO = 211,321
 140  ELBO = 166,894
 150  ELBO = 210,115
 160  ELBO = 187,824
 170  ELBO = 173,195
 180  ELBO = 173,743
 190  ELBO = 205,510
 200  ELBO = 189,702
 210  ELBO = 200,941
 220  ELBO = 181,681
 230  ELBO = 218,355
 240  ELBO = 173,864
 250  ELBO = 170,276
 260  ELBO = 171,114
 270  ELBO = 174,103
 280  ELBO = 185,191
 290  ELBO = 164,321
 300  ELBO = 193,377
 310  ELBO = 177,413
 320  ELBO = 181,125
 330  ELBO = 166,970
 340  ELBO = 179,617
 350  ELBO = 178,753
 360  ELBO = 177,921
 370  ELBO = 166,222
 380  ELBO = 174,462
 390  ELBO = 163,592
 400  ELBO = 178,974
 410  ELBO = 174,073
 420  ELBO = 164,714
 430  ELBO = 179,232
 440  ELBO = 165,262
 450  ELBO = 177,316
 460  ELBO = 169,859
 470  ELBO = 176,121
 480  ELBO = 166,555
 490  ELBO = 168,900
Log-evidence K=7: -174293.88

(range(2, 8),
 [-176548.53125,
  -176235.46875,
  -175584.796875,
  -176189.828125,
  -175024.0625,
  -174293.875])
Code
K_list = list(range(2, 8))
log_evidences = [-176548.53125,
                 -176235.46875,
                 -175584.796875,
                 -176189.828125,
                 -175024.0625,
                 -174293.875]

# take absolute values (the log-evidences are negative)
log_evidences_pos = np.abs(log_evidences)

# number of parameters for each K
# pi: K, A: K*K, rates (Gamma alpha and beta): 2K  =>  num_params = K + K^2 + 2K
num_params = [K + K**2 + 2*K for K in K_list]

N = 9236  # number of sequences
penalized = log_evidences_pos - 0.5 * np.array(num_params) * np.log(N)

plt.figure(figsize=(10,5))

plt.plot(K_list, log_evidences_pos, marker='o', label='|Log-evidence|')
plt.plot(K_list, penalized, marker='x', label='Penalized (BIC-like)')
plt.xlabel("Number of latent states K")
plt.ylabel("Value")
plt.title("Positive log-evidence and penalized criterion")
plt.grid(True)
plt.legend()
plt.show()
